# Hide Code Cells
# Renders an HTML/JS snippet that toggles visibility of all code-cell inputs
# in the exported notebook (targets JupyterLab's jp-CodeCell DOM classes).
# code_toggle() runs once on document-ready, so cells start hidden; the form
# button at the bottom flips `code_show` on each subsequent click.
# NOTE(review): the jQuery src URL below carries a trailing space before the
# closing quote — browsers trim it, but it is worth cleaning up at source.
from IPython.display import HTML
HTML('''
<script
src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js ">
</script>
<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.jp-CodeCell > div.jp-Cell-inputWrapper').hide();
} else {
$('div.jp-CodeCell > div.jp-Cell-inputWrapper').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit"
value="Click here to toggle on/off the raw code."></form>
''')
Overall, the Lang_data dataset offers a valuable resource for studying and understanding the 14 Asian languages, enabling advancements in linguistic research, communication technologies, and language-specific applications.
| Abbreviation | Variable Name |
|---|---|
| AR | Arabic |
| FA | Persian |
| HI | Hindi |
| KN | Kannada |
| NE | Nepali |
| PA | Panjabi |
| SI | Sinhala |
| TA | Tamil |
| UR | Urdu |
| ID | Indonesian |
| MY | Malaysian |
| TH | Thai |
| JA | Japanese |
| ZH | Chinese |
The following code snippet imports necessary libraries and modules such as Torch, TorchAudio, Pandas, Matplotlib, and Pickling. It sets up the environment for data processing, model training, and visualization, facilitating efficient and comprehensive analysis.
import torch
from torch import nn, optim
import torch.nn.functional as F
import torchaudio as ta
from torchaudio import transforms
from torch.utils.data import DataLoader, Dataset
import sys
import pytorch_lightning as pl
from transformers import AutoFeatureExtractor, ASTForAudioClassification, Wav2Vec2Model
from torch.optim import Adam
from torchvision import models
from utils.helper import get_targets_from_annotations
from utils.metrics import SingleLabelMetrics
from IPython.display import Audio
from pathlib import Path
from models.explainer import Deeplabv3Resnet50ExplainerModel
from models.classifier import VGG16ClassifierModel, Resnet50ClassifierModel
from models.explainer_salita import ExplainerClassifierModel
from utils.image_utils import save_mask, save_masked_image, save_all_class_masks
from utils.loss import TotalVariationConv, ClassMaskAreaLoss, entropy_loss
import pandas as pd
from pyjanitor import auto_toc
toc = auto_toc()
import os, re, shutil, copy, zipfile, glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
from pathlib import Path
from tqdm import tqdm, trange
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from pyjanitor import auto_toc
toc = auto_toc()
from pickling import *
There was a problem when trying to write in your cache folder (/home/mbalogal/.cache/huggingface/hub). You should set the environment variable TRANSFORMERS_CACHE to a writable directory. Matplotlib created a temporary config/cache directory at /tmp/matplotlib-7knnau74 because the default path (/home/mbalogal/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
# Select the active user and point the XDG / Hugging Face caches at a
# writable per-user directory — the default ~/.cache is not writable on
# this cluster (see the TRANSFORMERS_CACHE warning in the output above).
# username = 'cvillarin'
username = 'mbalogal'
# username = 'vdelossantos'
# username = 'jfabrero'
# NOTE(review): these env vars are read at import time by some libraries;
# confirm this cell runs before the first transformers/matplotlib use.
os.environ['XDG_CACHE_HOME'] = f'/home/msds2023/{username}/.cache'
os.environ['HUGGINGFACE_HUB_CACHE'] = f'/home/msds2023/{username}/.cache'
# Prefer the GPU when one is visible to this process.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
cuda
# Extract the Kaggle archive into the working directory, then discover the
# language class folders under ./data.
zip_ = 'data.zip' # Replace with downloaded .zip from Kaggle
with zipfile.ZipFile(zip_, 'r') as zip_ref:
    zip_ref.extractall('.')
# Class folders are exactly the two-letter language codes (AR, FA, ...).
# The previous filter `x[-3] == '/'` encoded "basename is 2 chars" via a
# hard-coded POSIX separator; this form is equivalent and OS-portable.
classes = [os.path.basename(x) for x in glob.glob('./data/*')
           if len(os.path.basename(x)) == 2]
def rename_audio(class_, root='./data'):
    """Rename every .wav inside each class folder to '<class>_<i>.wav'.

    Parameters
    ----------
    class_ : iterable of str
        Class (language) folder names under `root`.
    root : str
        Parent directory of the class folders (default './data', matching
        the original hard-coded path).

    Only .wav files are enumerated, so indices are contiguous 0..n-1 —
    the previous version counted non-.wav entries too, leaving gaps that
    made create_dataset silently skip files. Renaming is done in two
    phases (via temporary names) so a file is never clobbered by a
    sibling that already carries its target name, e.g. on a second run.
    """
    for c in tqdm(class_):
        path = os.path.join(root, c)
        wavs = sorted(f for f in os.listdir(path) if f.endswith('.wav'))
        # Phase 1: move everything out of the way under temp names.
        for i, audio in enumerate(wavs):
            os.rename(os.path.join(path, audio),
                      os.path.join(path, f'.renaming_{i}.tmp'))
        # Phase 2: assign the final contiguous names.
        for i in range(len(wavs)):
            os.rename(os.path.join(path, f'.renaming_{i}.tmp'),
                      os.path.join(path, f'{c}_{i}.wav'))
rename_audio(classes)
100%|██████████| 14/14 [00:00<00:00, 59.33it/s]
def create_dataset(src, dst, class_, dist=(.6,.2,.2), overwrite=False):
    """Copy each class's audio from `src` into train/validation/test
    subfolders of `dst`, split according to `dist`.

    Parameters
    ----------
    src : str
        Root directory holding one folder per class.
    dst : str
        Destination root; gains 'train'/'validation'/'test' subfolders.
    class_ : iterable of str
        Class (language) folder names to copy.
    dist : tuple of float
        Train/validation/test fractions. Counts are truncated with int(),
        so up to a couple of files per class may fall outside every split
        — kept as-is to preserve the original split sizes.
    overwrite : bool
        When True, delete `dst` first and rebuild from scratch; when
        False, pre-existing stage directories are left untouched.
    """
    if os.path.exists(dst) and overwrite:
        shutil.rmtree(dst)
    for c in tqdm(class_):
        c_path = os.path.join(src, c)
        n_data = len(os.listdir(c_path))
        # Per-split counts, then cumulative boundaries [0, n0, n0+n1, ...].
        ns = list(map(lambda x: int(n_data*x), dist))
        ns = [0]+[x+sum(ns[:i]) for i, x in enumerate(ns)]
        for i, stage in enumerate(['train', 'validation', 'test']):
            stage_path = os.path.join(dst, stage)
            if os.path.exists(stage_path) and not overwrite:
                # Keep an existing split untouched unless asked to rebuild.
                continue
            os.makedirs(stage_path, exist_ok=True)
            label_path = os.path.join(stage_path, c)
            # exist_ok avoids the crash the old bare makedirs hit when the
            # class folder was already present.
            os.makedirs(label_path, exist_ok=True)
            for j in range(ns[i], ns[i+1]):
                fname = f'{c}_{j}.wav'
                src_file = os.path.join(c_path, fname)
                dst_file = os.path.join(label_path, fname)
                try:
                    shutil.copyfile(src_file, dst_file)
                except FileNotFoundError:
                    # Missing indices can occur if renaming left gaps; skip
                    # them but let any other error surface (the old bare
                    # `except: pass` hid real failures).
                    pass
    # Report the resulting per-stage, per-class file counts.
    for stage in ['train', 'validation', 'test']:
        for c in class_:
            label_path = os.path.join(os.path.join(dst, stage), c)
            n_data = len(os.listdir(label_path))
            print(f'Total {stage.title()} {c.title()} Audio:', f'\t{n_data}')
src = 'data'
dst = 'data/subset'
create_dataset(src, dst, classes, overwrite=True)
100%|██████████| 14/14 [00:01<00:00, 9.23it/s]
Total Train Ar Audio: 14 Total Train Fa Audio: 14 Total Train Hi Audio: 14 Total Train Id Audio: 11 Total Train Ja Audio: 14 Total Train Kn Audio: 14 Total Train My Audio: 14 Total Train Ne Audio: 14 Total Train Pa Audio: 14 Total Train Si Audio: 14 Total Train Ta Audio: 14 Total Train Th Audio: 14 Total Train Ur Audio: 14 Total Train Zh Audio: 14 Total Validation Ar Audio: 5 Total Validation Fa Audio: 5 Total Validation Hi Audio: 5 Total Validation Id Audio: 4 Total Validation Ja Audio: 5 Total Validation Kn Audio: 5 Total Validation My Audio: 5 Total Validation Ne Audio: 5 Total Validation Pa Audio: 5 Total Validation Si Audio: 5 Total Validation Ta Audio: 5 Total Validation Th Audio: 5 Total Validation Ur Audio: 5 Total Validation Zh Audio: 5 Total Test Ar Audio: 5 Total Test Fa Audio: 5 Total Test Hi Audio: 5 Total Test Id Audio: 4 Total Test Ja Audio: 5 Total Test Kn Audio: 5 Total Test My Audio: 5 Total Test Ne Audio: 5 Total Test Pa Audio: 5 Total Test Si Audio: 5 Total Test Ta Audio: 5 Total Test Th Audio: 5 Total Test Ur Audio: 5 Total Test Zh Audio: 5
# Initialize Directories
# Map each stage name to its folder under data/subset (populated above by
# create_dataset) for later manifest generation.
audio_path = Path('data/subset')
paths = {x: audio_path / x for x in ['train', 'validation', 'test']}
print(f'Training Dataset Directory: \t{paths["train"]}')
print(f'Validation Dataset Directory: \t{paths["validation"]}')
print(f'Test Dataset Directory: \t{paths["test"]}')
Training Dataset Directory: data/subset/train Validation Dataset Directory: data/subset/validation Test Dataset Directory: data/subset/test
def get_annotations(paths, classes=classes):
    """Write one <stage>.csv manifest per stage directory.

    Each row holds (path, label, label_index) for a single audio file.
    The DataFrame's default integer index becomes the CSV's first column,
    which AudioDataset later reads back with index_col=0.
    """
    for stage, stage_dir in paths.items():
        rows = []
        for label_idx, lang in tqdm(enumerate(classes)):
            class_dir = f'{stage_dir}/{lang}'
            rows.extend({'path': f'{class_dir}/{fname}',
                         'label': lang,
                         'label_index': label_idx}
                        for fname in os.listdir(class_dir))
        pd.DataFrame(rows).to_csv(f'./{stage}.csv', header=False)
get_annotations(paths, classes=classes)
14it [00:00, 1952.72it/s] 14it [00:00, 2255.52it/s] 14it [00:00, 2230.76it/s]
The following code defines two classes: AudioDataset and AudioDataModule. Let's go through each class and understand their functionalities:
1. AudioDataset: a subclass of the Dataset class provided by the PyTorch library. Its __init__ method initializes the dataset object, taking meta_data (a CSV file path) and num_frames as input; it reads the CSV file with pd.read_csv and stores it in self.meta_data. The __len__ method returns the length of the dataset, i.e. the number of rows in self.meta_data. The __getitem__ method retrieves an item by index using the helper methods _get_audio_sample_path, _get_audio_sample_label, and _get_audio_sample_label_index, loads the audio with ta.load, and applies a mel-spectrogram transformation using transforms.MelSpectrogram from the torchaudio library.
2. AudioDataModule:
2. AudioDataModule: a LightningDataModule that handles data loading and processing for the audio dataset. Its __init__ method initializes the data module with parameters such as batch_size, num_workers, and pin_memory. The setup method defines the datasets for the train, validation, and test stages by creating one AudioDataset instance per stage. The pad_sequence method pads the sequences in a batch with zeros so they share the same length. The collate_fn method is a custom collate function used by the DataLoader: it gathers tensors, targets, and file paths from the batch and returns them in a consistent format. The train_dataloader, val_dataloader, and test_dataloader methods return DataLoader objects for their respective stages, with appropriate settings for batch size, shuffling, collate function, and number of workers. Together these classes handle audio data loading, preprocessing, and batching in a PyTorch-based deep learning project; AudioDataModule provides an organized, standardized way to define and access data loaders for each stage of training.
class AudioDataset(Dataset):
    """Dataset over a <stage>.csv manifest of (path, label, label_index) rows.

    Each item yields: mel-spectrogram, label string, integer label index,
    the raw signal tensor, and the absolute file path.
    """
    def __init__(self, meta_data, num_frames=160_000):
        # meta_data: path to a manifest CSV written by get_annotations
        # (no header row, row index in the first column).
        self.num_frames = num_frames
        self.meta_data = pd.read_csv(meta_data, header=None, index_col=0)
        # MelSpectrogram construction is deterministic for a fixed sample
        # rate, so memoize one transform per rate instead of rebuilding it
        # on every __getitem__ call.
        self._mel_transforms = {}
    def __len__(self):
        return len(self.meta_data)
    def _get_mel_transform(self, sr):
        # Lazily build and cache the transform for this sample rate.
        if sr not in self._mel_transforms:
            self._mel_transforms[sr] = transforms.MelSpectrogram(sr, n_mels=40)
        return self._mel_transforms[sr]
    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        label_index = self._get_audio_sample_label_index(index)
        # Read at most num_frames samples from the file.
        signal, sr = ta.load(audio_sample_path, num_frames=self.num_frames)
        # NOTE: this is a mel-spectrogram, not MFCCs, despite the variable
        # names used elsewhere in the notebook.
        mel = self._get_mel_transform(sr)(signal).squeeze()
        return mel, label, label_index, signal, audio_sample_path
    def _get_audio_sample_path(self, index):
        # Column 0 holds the relative path; anchor it to the current cwd.
        return os.path.join(os.getcwd(), self.meta_data.iloc[index, 0])
    def _get_audio_sample_label(self, index):
        return self.meta_data.iloc[index, 1]
    def _get_audio_sample_label_index(self, index):
        return self.meta_data.iloc[index, 2]
class AudioDataModule(pl.LightningDataModule):
    """LightningDataModule wiring the train/validation/test AudioDatasets
    into padded, batched DataLoaders."""
    def __init__(self, batch_size=256, num_workers=0, pin_memory=True):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.datasets = {}
        self.dataloaders = {}
    def setup(self, stage=None):
        # Build one AudioDataset per manifest CSV, regardless of `stage`.
        self.datasets = {name: AudioDataset(f'{name}.csv')
                         for name in ['train', 'validation', 'test']}
    def pad_sequence(self, batch):
        """Zero-pad variable-length (mel, time) tensors to a common length."""
        time_major = [item.t() for item in batch]
        padded = torch.nn.utils.rnn.pad_sequence(time_major,
                                                 batch_first=True,
                                                 padding_value=0.)
        return padded.permute(0, 2, 1)
    def collate_fn(self, batch):
        """Collate dataset items into (padded mels, stacked targets, paths)."""
        mels = [item[0] for item in batch]
        targets = [torch.tensor(item[2]) for item in batch]
        paths = [item[4] for item in batch]
        return self.pad_sequence(mels), torch.stack(targets), paths
    def _make_dataloader(self, name, shuffle):
        # Shared DataLoader construction for all three stages.
        return DataLoader(
            self.datasets[name],
            batch_size=self.batch_size,
            shuffle=shuffle,
            collate_fn=self.collate_fn,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
        )
    def train_dataloader(self):
        return self._make_dataloader('train', shuffle=True)
    def val_dataloader(self):
        return self._make_dataloader('validation', shuffle=False)
    def test_dataloader(self):
        return self._make_dataloader('test', shuffle=False)
The next code defines a PyTorch Lightning module named SALITA. Let's go through its main components:
1. __init__ method: initializes the SALITA module with parameters such as num_classes, dataset, learning_rate, and metrics_threshold.
2. setup_model method: builds convolutional layers (Conv1d) each followed by max pooling (MaxPool1d), plus fully connected layers (Linear); the final layer outputs num_classes values.
3. setup_losses method: sets up the loss function (CrossEntropyLoss) for multiclass classification.
4. forward method: the input x is passed through the convolutional layers with ReLU activations and max pooling operations, then through the fully connected head; the output is log-normalized (log_softmax) to obtain class log-probabilities.
5. setup_metrics method: creates train, validation, and test instances of SingleLabelMetrics, each initialized with the number of classes.
6. training_step, validation_step, and test_step methods: compute the loss (and accuracy where applicable) for a batch and log the results for each stage.
7. configure_optimizers method: configures the optimizer (Adam) with the specified learning rate.
8. on_test_epoch_end method: computes, logs, and saves the accumulated test metrics.
The SALITA module is designed for audio classification tasks using a CNN architecture. It provides methods for training, validation, testing, and configuring the optimizer.
class SALITA(pl.LightningModule):
    """Language-ID classifier: five Conv1d/ReLU/MaxPool stages over 40-band
    mel-spectrograms followed by a two-layer fully connected head.

    Trained with cross-entropy over `num_classes` languages; per-stage
    metrics are tracked with SingleLabelMetrics.
    """
    def __init__(self,
                 num_classes=14,
                 dataset="lang_data",
                 learning_rate=1e-5,
                 metrics_threshold=0.0):
        super().__init__()
        self.setup_model(num_classes)
        self.setup_losses()
        self.setup_metrics(num_classes=num_classes)
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        self.dataset = dataset
    def setup_model(self, num_classes):
        """Build the CNN feature extractor and classification head, warm-
        starting from a local checkpoint when one is present."""
        self.conv1 = nn.Conv1d(40, 64, kernel_size=3, stride=1)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, stride=1)
        self.conv3 = nn.Conv1d(128, 256, kernel_size=3, stride=1)
        self.conv4 = nn.Conv1d(256, 512, kernel_size=3, stride=1)
        self.conv5 = nn.Conv1d(512, 1024, kernel_size=3, stride=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        # 1024 channels x 23 time steps after the five conv/pool stages;
        # assumes the 160_000-sample / 40-mel inputs used in this notebook
        # — TODO confirm before feeding different input lengths.
        self.fc1 = nn.Linear(1024 * 23, 512)
        self.fc2 = nn.Linear(512, num_classes)
        # Load parameters from .pth file when available.
        pretrained_file = "final_model_checkpoint.pth"
        # pretrained_file = "/mnt/processed/private/msds2023/cpt8/ml3_project/saves/epoch10_model.pth" #Edited
        if os.path.isfile(pretrained_file):
            # map_location='cpu' lets a GPU-saved checkpoint load on a
            # CPU-only host; the module is moved to `device` by the caller.
            state_dict = torch.load(pretrained_file, map_location='cpu')
            self.load_state_dict(state_dict)
    def setup_losses(self):
        self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, x):
        """Return per-class log-probabilities for a batch of mel-spectrograms."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = self.pool(F.relu(conv(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        # Feeding log_softmax output to CrossEntropyLoss still trains
        # correctly (log_softmax is idempotent, so CE reduces to NLL),
        # though returning raw logits would be more conventional.
        return F.log_softmax(x, dim=1)
    def setup_metrics(self, num_classes):
        self.train_metrics = SingleLabelMetrics(num_classes=num_classes)
        self.valid_metrics = SingleLabelMetrics(num_classes=num_classes)
        self.test_metrics = SingleLabelMetrics(num_classes=num_classes)
    def training_step(self, batch, batch_idx):
        x, y, _ = batch
        logits = self(x)
        preds = torch.argmax(logits, dim=1)
        accuracy = (preds == y).sum().item() / len(y)
        loss = self.loss_fn(logits, y)
        self.log('train_loss', loss)
        self.log('train_accuracy', accuracy)
        self.train_metrics(logits, y)
        # Lightning uses the returned loss for the backward pass; without
        # this return no optimisation step is performed.
        return loss
    def validation_step(self, batch, batch_idx):
        x, y, _ = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        self.log('val_loss', loss)
        self.valid_metrics(logits, y)
    def test_step(self, batch, batch_idx):
        x, y, _ = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        # Calculate accuracy
        preds = torch.argmax(logits, dim=1)
        accuracy = (preds == y).sum().item() / len(y)
        self.log('test_loss', loss)
        self.log('test_accuracy', accuracy)
        self.test_metrics(logits, y)
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=self.learning_rate)
    def on_test_epoch_end(self):
        # compute() already returns the aggregated result; calling
        # .compute() on that result a second time (as before) was a bug.
        test_metrics = self.test_metrics.compute()
        self.log('test_metrics', test_metrics, prog_bar=True)
        self.test_metrics.save(model="classifier", classifier_type="SALITA",
                               dataset=self.dataset)
        self.test_metrics.reset()
        # Keep the results around for inspection after trainer.test().
        self.test_metrics_results = test_metrics
# DataLoader/host settings: with a GPU, use a worker process and pinned
# memory for faster host-to-device copies; otherwise keep loading in the
# main process.
batch_size = 8
if device.type == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False
data_module = AudioDataModule(batch_size=batch_size,
                              num_workers=num_workers,
                              pin_memory=pin_memory)
# Instantiate the classifier (loads final_model_checkpoint.pth when that
# file is present) and move it to the selected device.
model = SALITA()
model.to(device)
SALITA( (conv1): Conv1d(40, 64, kernel_size=(3,), stride=(1,)) (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,)) (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,)) (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,)) (conv5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,)) (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) (fc1): Linear(in_features=23552, out_features=512, bias=True) (fc2): Linear(in_features=512, out_features=14, bias=True) (loss_fn): CrossEntropyLoss() (train_metrics): SingleLabelMetrics() (valid_metrics): SingleLabelMetrics() (test_metrics): SingleLabelMetrics() )
The code snippet loads two pickled arrays from files: test_cm and train_cm. Pickling is a way to serialize Python objects into a binary format that can be easily stored and retrieved. In this case, the arrays were likely serialized and stored using the pickle module.
The first block of code opens the file named 'test_cm.pickle' in binary mode ('rb') and uses the pickle.load() function to deserialize and load the contents of the file into the test_cm variable. Similarly, the second block of code opens the file named 'train_cm.pickle' and loads its contents into the train_cm variable.
After executing these lines, test_cm and train_cm will contain the data that was previously serialized and stored in the pickle files.
# Unpickle the array
# Load the precomputed test/train confusion matrices. NOTE: pickle.load
# can execute arbitrary code from the file — acceptable only because these
# files were written by this project; never use it on untrusted input.
with open('test_cm.pickle', 'rb') as file:
    test_cm = pickle.load(file)
# Unpickle the array
with open('train_cm.pickle', 'rb') as file:
    train_cm = pickle.load(file)
The code snippet below creates a ConfusionMatrixDisplay object disp using the train_cm and test_cm arrays, which are confusion matrices of the train and test sets.
The plot() method of the disp object is then called to generate a plot of the confusion matrix. The plot visually represents the performance of a classification model by showing the counts or proportions of correct and incorrect predictions for each class.
Finally, plt.show() is called to display the generated plot. This function is typically used in conjunction with the Matplotlib library to show the figures or plots created using its plotting functions.
# Plot the training-set confusion matrix.
disp = ConfusionMatrixDisplay(train_cm)
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(cmap='BuPu', ax=ax)
plt.show()
# Plot the test-set confusion matrix.
disp = ConfusionMatrixDisplay(test_cm)
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(cmap='BuPu', ax=ax)
plt.show()
These are commands used to copy files and directories. Let's break them down:
!cp -r "./NN-Explainer/src/utils" .cp is a command in Unix-like systems used to copy files and directories.-r is an option that allows recursive copying, meaning it copies directories and their contents."./NN-Explainer/src/utils" specifies the source directory that we want to copy.. represents the current directory, indicating the destination where the files and directories will be copied to.This command copies the "utils" directory from the "NN-Explainer" repository's source directory to our current directory.
!cp -r "./NN-Explainer/src/models" .
!cp ./explainer_salita.py ./models
cp is used to copy a single file."./explainer_salita.py" is the source file we want to copy."./models" represents the destination directory where we want to copy the file.This command copies the file "explainer_salita.py" to the "models" directory.
Overall, these commands are used to copy directories and files from the "NN-Explainer" repository to the current directory, allowing us to use or modify them locally.
!cp -r "./NN-Explainer/src/utils" .
!cp -r "./NN-Explainer/src/models" .
!cp ./explainer_salita.py ./models
# !git clone https://github.com/stevenstalder/NN-Explainer.git
We initialize an ExplainerClassifierModel, set it to evaluation mode, prepare the necessary data module for testing, and create a directory to save the explainer model's masks based on the dataset, classifier type, and mode.
The code snippet performs the following steps:
1. Creates an ExplainerClassifierModel object named explainer (wrapping the trained classifier model) and moves it to the specified device.
2. Sets the explainer model to evaluation mode.
3. Creates an AudioDataModule object named data_module and calls its setup method with stage set to "test", which sets up the test dataset.
4. Defines the dataset variable as "lang_data", the classifier_type variable as "SALITA", and the mode variable as "seg".
5. Builds the save_path object as a Path based on the dataset, classifier type, and "explainer" mode, then checks if save_path exists and, if not, creates the directory using os.makedirs().
explainer = ExplainerClassifierModel(classifier=model).to(device)
/opt/conda/lib/python3.10/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. warnings.warn( /opt/conda/lib/python3.10/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=None`. warnings.warn(msg)
# Switch the explainer (and its wrapped classifier) to inference mode:
# disables dropout and freezes batch-norm statistics.
explainer.eval()
ExplainerClassifierModel(
(explainer): Deeplabv3Resnet50ExplainerModel(
(explainer): DeepLabV3(
(backbone): IntermediateLayerGetter(
(conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(layer1): Sequential(
(0): Bottleneck(
(conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(layer2): Sequential(
(0): Bottleneck(
(conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(3): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(layer3): Sequential(
(0): Bottleneck(
(conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(3): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(4): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(5): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(layer4): Sequential(
(0): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(4, 4), dilation=(4, 4), bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(4, 4), dilation=(4, 4), bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
)
(classifier): DeepLabHead(
(0): ASPP(
(convs): ModuleList(
(0): Sequential(
(0): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU()
)
(1): ASPPConv(
(0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(12, 12), dilation=(12, 12), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU()
)
(2): ASPPConv(
(0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(24, 24), dilation=(24, 24), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU()
)
(3): ASPPConv(
(0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(36, 36), dilation=(36, 36), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU()
)
(4): ASPPPooling(
(0): AdaptiveAvgPool2d(output_size=1)
(1): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(3): ReLU()
)
)
(project): Sequential(
(0): Conv2d(1280, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU()
(3): Dropout(p=0.5, inplace=False)
)
)
(1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(3): ReLU()
(4): Conv2d(256, 14, kernel_size=(1, 1), stride=(1, 1))
)
)
)
(classifier): SALITA(
(conv1): Conv1d(40, 64, kernel_size=(3,), stride=(1,))
(conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,))
(conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,))
(conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,))
(conv5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,))
(pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(fc1): Linear(in_features=23552, out_features=512, bias=True)
(fc2): Linear(in_features=512, out_features=14, bias=True)
(loss_fn): CrossEntropyLoss()
(train_metrics): SingleLabelMetrics()
(valid_metrics): SingleLabelMetrics()
(test_metrics): SingleLabelMetrics()
)
(total_variation_conv): TotalVariationConv(
(variance_right_filter): Conv2d(1, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False, padding_mode=reflect)
(variance_down_filter): Conv2d(1, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False, padding_mode=reflect)
)
(classification_loss_fn): CrossEntropyLoss()
(train_metrics): SingleLabelMetrics()
(valid_metrics): SingleLabelMetrics()
(test_metrics): SingleLabelMetrics()
)
num_classes = 14
# data_base_path = '../../datasets/'
# data_path = Path(data_base_path) / "lang_data"
# Prepare the test split and a folder for the explainer's saliency masks.
data_module = AudioDataModule()
data_module.setup(stage = "test")
dataset = "lang_data"
classifier_type = "SALITA"
mode = "seg"
save_path = Path('masks/{}_{}_{}/'.format(dataset, classifier_type, "explainer"))
if not os.path.isdir(save_path):
    os.makedirs(save_path)
# Index -> language-code mapping; keys are floats (indices arrive as float
# values downstream — TODO confirm). NOTE(review): this hard-coded order
# must match the label indices used when the classifier was trained —
# verify against the training-time class ordering.
i2l_dict = {
    0.0: 'HI',
    1.0: 'NE',
    2.0: 'TH',
    3.0: 'SI',
    4.0: 'JA',
    5.0: 'PA',
    6.0: 'AR',
    7.0: 'TA',
    8.0: 'KN',
    9.0: 'FA',
    10.0: 'MY',
    11.0: 'ZH',
    12.0: 'UR',
    13.0: 'ID',
}
# Reverse lookup: language code -> index.
l2i_dict = {y: x for x, y in i2l_dict.items()}
# Build the model input for one out-of-dataset sample: load up to
# num_frames samples, take a 40-band mel-spectrogram, and repeat it over
# three channels (presumably because the explainer's ResNet backbone
# expects 3-channel input — confirm against Deeplabv3Resnet50ExplainerModel).
audio_sample_path = 'Luffy_cut.wav'
label = 'JA'
label_index = l2i_dict[label]
num_frames = 160_000
n_mels = 40
signal, sr = ta.load(audio_sample_path, num_frames=num_frames)
transform = transforms.MelSpectrogram(sr, n_mels=n_mels)
mfcc = transform(signal).squeeze()
x = mfcc.repeat(3, 1, 1)
x = x.to(device)
y = torch.tensor(label_index).to(device)
filename = audio_sample_path.rsplit('/', 1)[-1]
# Run the explainer to obtain the saliency mask, and the wrapped
# classifier alone for the prediction.
_, _, mask, _, _ = explainer(x, y)
predict = explainer.classifier(x)
The plot_waveform function takes an audio waveform, slices or pads it to the desired length, and plots it using matplotlib. It provides a visual representation of the waveform with the specified title.
def plot_waveform(waveform, sr, num_frames, title="Waveform"):
    """Plot ``waveform`` trimmed or zero-padded to exactly ``num_frames`` samples.

    Args:
        waveform: 2-D tensor of shape (n_channels, n_frames); only the first
            channel is plotted.
        sr: sample rate in Hz, used to build the time axis.
        num_frames: target length in samples.
        title: figure title.

    Returns:
        np.ndarray of shape (1, num_frames): the trimmed/padded first channel.
    """
    waveform = waveform.numpy()
    n_channels, n_frames = waveform.shape
    if n_channels > 1:
        # Keep only the first channel for display.
        waveform = waveform[:1]
    if n_frames > num_frames:
        padded = waveform[:, :num_frames]
    else:
        padded = np.zeros((1, num_frames))
        # Bug fix: copy only the available n_frames samples. The original
        # wrote `padded[:, :num_frames] = waveform`, which raises a shape
        # mismatch whenever the clip is shorter than num_frames.
        padded[:, :n_frames] = waveform
    time_axis = torch.arange(0, num_frames) / sr
    figure, axes = plt.subplots(1, 1, figsize=(15, 6))
    axes.plot(time_axis, padded[0], linewidth=1, c='k')
    axes.axis('off')
    figure.suptitle(title)
    plt.show(block=False)
    return padded
# Side-by-side view: log-spectrogram of the input vs. the explainer's mask.
fig, ax = plt.subplots(1, 2, figsize=(15, 4))
panels = [
    (torch.log(x.view(40, 801, -1)[:, :, 0] + 1e-3).cpu(), 'PuRd',
     'MFCC of the Sample Audio File'),
    (mask.view(40, 801, -1)[:, :, 0].cpu(), 'gray',
     'Saliency Mask of the Sample Audio File'),
]
for axis, (data, cmap, title) in zip(ax, panels):
    sns.heatmap(data, cmap=cmap, cbar=False, ax=axis)
    axis.axis('off')
    axis.set_title(title)
toc.add_fig('MFCC Representation - Sample', width=100)
Let's go through each function:
erode(image, selem, n=1): Performs erosion on the input image using the structuring element selem. The erosion operation shrinks the bright regions in the image. It can be applied multiple times by specifying the parameter n. The function returns the eroded image.
dilate(image, selem, n=1): Performs dilation on the input image using the structuring element selem. The dilation operation expands the bright regions in the image. It can be applied multiple times by specifying the parameter n. The function returns the dilated image.
n_close(image, selem, n=1): Performs closing on the input image using the structuring element selem. Closing is the combination of dilation followed by erosion and is useful for closing small gaps or holes in the bright regions of the image. It can be applied multiple times by specifying the parameter n. The function returns the closed image.
n_open(image, selem, n=1): Performs opening on the input image using the structuring element selem. Opening is the combination of erosion followed by dilation and is useful for removing small bright regions or smoothing the edges of bright regions in the image. It can be applied multiple times by specifying the parameter n. The function returns the opened image.
plot_waveform(waveform, sr, num_frames, title="Waveform"): Takes an audio waveform represented by the waveform tensor, the sample rate sr, and the desired number of frames num_frames. It plots the waveform using matplotlib, ensuring that it has the specified number of frames. The resulting plot is displayed, and the padded waveform is returned.
colorFader(c1, c2, mix=0): Performs linear interpolation between two colors c1 and c2 based on the mix parameter (0 to 1). It returns the interpolated color in hexadecimal format.
from skimage.morphology import erosion, dilation, opening, closing
def erode(image, selem, n=1):
    """Erode `image` with structuring element `selem`, repeated `n` times.

    Erosion shrinks the bright regions of the image.
    """
    remaining = n
    while remaining > 0:
        image = erosion(image, selem)
        remaining -= 1
    return image
def dilate(image, selem, n=1):
    """Dilate `image` with structuring element `selem`, repeated `n` times.

    Dilation expands the bright regions of the image.
    """
    result = image
    for _step in range(n):
        result = dilation(result, selem)
    return result
def n_close(image, selem, n=1):
    """Perform closing (dilation followed by erosion) `n` times.

    Closing fills small gaps or holes in the bright regions of the image.
    (Docstring fix: the original said "dilation", which is incorrect.)
    """
    for _ in range(n):
        image = closing(image, selem)
    return image
def n_open(image, selem, n=1):
    """Perform opening (erosion followed by dilation) `n` times.

    Opening removes small bright regions and smooths the edges of the
    remaining ones. (Docstring fix: the original said "dilation".)
    """
    for _ in range(n):
        image = opening(image, selem)
    return image
def plot_waveform(waveform, sr, num_frames, title="Waveform"):
    """Plot ``waveform`` trimmed or zero-padded to exactly ``num_frames`` samples.

    Same contract as the earlier version, but records the figure via
    ``toc.add_fig`` instead of calling ``plt.show``.

    Args:
        waveform: 2-D tensor of shape (n_channels, n_frames); only the first
            channel is plotted.
        sr: sample rate in Hz, used to build the time axis.
        num_frames: target length in samples.
        title: figure title.

    Returns:
        np.ndarray of shape (1, num_frames): the trimmed/padded first channel.
    """
    waveform = waveform.numpy()
    n_channels, n_frames = waveform.shape
    if n_channels > 1:
        # Keep only the first channel for display.
        waveform = waveform[:1]
    if n_frames > num_frames:
        padded = waveform[:, :num_frames]
    else:
        padded = np.zeros((1, num_frames))
        # Bug fix: copy only the available n_frames samples. The original
        # wrote `padded[:, :num_frames] = waveform`, which raises a shape
        # mismatch whenever the clip is shorter than num_frames.
        padded[:, :n_frames] = waveform
    time_axis = torch.arange(0, num_frames) / sr
    figure, axes = plt.subplots(1, 1, figsize=(15, 6))
    axes.plot(time_axis, padded[0], linewidth=1, c='k')
    axes.axis('off')
    figure.suptitle(title)
    toc.add_fig('Audio File Visualization')
    return padded
def colorFader(c1, c2, mix=0):
    """Linearly interpolate from color `c1` (mix=0) to `c2` (mix=1).

    Returns the blended color as a hex string.
    """
    start_rgb = np.array(mpl.colors.to_rgb(c1))
    end_rgb = np.array(mpl.colors.to_rgb(c2))
    blended = (1 - mix) * start_rgb + mix * end_rgb
    return mpl.colors.to_hex(blended)
The next code snippet performs various operations on the input mask and waveform data. Here's a breakdown of the steps:
waveform, sr = ta.load(audio_sample_path): Loads an audio waveform from the specified file path using the ta.load function. The resulting waveform and sample rate are assigned to the variables waveform and sr, respectively.
plot_mask = mask + mask.min(): Adds the minimum value of the mask tensor to every element. Note that this guarantees non-negative values only if the minimum is itself non-negative; to shift an arbitrary mask into a non-negative range, the minimum would need to be subtracted (mask - mask.min()).
plot_mask = (plot_mask.sum(1) / plot_mask.sum(1).max()).mean(0): Sums plot_mask along dimension 1, divides those sums by their maximum so the largest value becomes 1, and then averages the result along the 0th dimension.
thick_mask = (plot_mask > plot_mask.quantile(.75)).float(): Creates a binary mask tensor thick_mask by thresholding the plot_mask tensor at the 75th percentile value, converting it to a float tensor.
wave_mask = plot_mask[:-1].view(1, -1).t().repeat(1, 200).view(1, -1).cpu().numpy(): Drops the last element of plot_mask, then expands each remaining time step into 200 consecutive values so the per-frame importance aligns with the 160,000-sample waveform, and converts the result to a NumPy array.
closed_mask = erode(thick_mask.repeat(3, 1).cpu().numpy(), np.array([[0,0,0],[1,1,1],[0,0,0]]), 5): Performs erosion operation on the thick_mask tensor by repeatedly applying a 3x3 erosion structuring element. The erosion operation is performed 5 times. The resulting tensor is converted to a NumPy array.
closed_mask = dilate(closed_mask, np.array([[0,0,0],[1,1,1],[0,0,0]]), 5): Performs dilation operation on the closed_mask tensor by repeatedly applying a 3x3 dilation structuring element. The dilation operation is performed 5 times.
closed_mask = n_close(closed_mask, np.array([[0,0,0],[1,1,1],[0,0,0]]), 5): Performs closing operation on the closed_mask tensor by repeatedly applying a 3x3 structuring element. The closing operation is performed 5 times.
wave_top_mask = np.repeat(closed_mask[0, :-1].reshape(1, -1).T, 200, 1).reshape(1, -1): Reshapes the first row of the closed_mask tensor by excluding the last element, repeats it 200 times horizontally, and reshapes it again. The resulting wave_top_mask has a modified shape for further processing.
# Reload the full clip and plot it trimmed/zero-padded to `num_frames` samples.
waveform, sr = ta.load(audio_sample_path)
padded_waveform = plot_waveform(waveform, sr, num_frames, title="Original waveform")
# NOTE(review): `mask + mask.min()` only shifts the range when the minimum is
# non-negative; to guarantee non-negative values this should presumably be
# `mask - mask.min()` — confirm intent.
plot_mask = mask + mask.min()
# Collapse to a per-time-step importance curve: sum over dim 1, normalise by
# the maximum of those sums, then average over dim 0.
plot_mask = (plot_mask.sum(1) / plot_mask.sum(1).max()).mean(0)
# Binary importance mask: keep only the top quartile of time steps.
thick_mask = (plot_mask > plot_mask.quantile(.75)).float()
# Drop the last time step, then expand each remaining step to 200 consecutive
# samples so the curve lines up with the 160,000-sample waveform.
wave_mask = plot_mask[:-1].view(1, -1).t().repeat(1, 200).view(1, -1).cpu().numpy()
# Morphological clean-up along the time axis using a horizontal cross element:
# erode 5x to remove isolated single-frame spikes ...
closed_mask = erode(thick_mask.repeat(3, 1).cpu().numpy(),
                    np.array([[0,0,0],
                              [1,1,1],
                              [0,0,0]]),
                    5)
# ... dilate 5x to restore the extent of the surviving segments ...
closed_mask = dilate(closed_mask,
                     np.array([[0,0,0],
                               [1,1,1],
                               [0,0,0]]),
                     5)
# ... and close 5x to fill small gaps between neighbouring segments.
closed_mask = n_close(closed_mask,
                      np.array([[0,0,0],
                                [1,1,1],
                                [0,0,0]]),
                      5)
# Expand the cleaned binary mask to sample resolution (200 samples per step).
wave_top_mask = np.repeat(closed_mask[0, :-1].reshape(1, -1).T, 200, 1).reshape(1, -1)
# Shade each of the 801 time steps from white (unimportant) to purple (important).
c1='white'
c2='purple'
n=801
fig, ax = plt.subplots(1, 2, figsize=(15, 2))
for i in range(n):
    ax[1].axvline(i, color=colorFader(c1,c2,closed_mask[0, i]), linewidth=4)
# 160,000 samples / 200 per step maps the waveform onto the 0..800 frame axis.
time_axis = torch.arange(0, 160_000) / 200
for ai in ax:
    ai.plot(time_axis, padded_waveform[0], linewidth=1, c='k')
    ai.axis('off')
ax[0].set_title('Original Waveform')
ax[1].set_title('Waveform with Importance Gradients')
toc.add_fig('Sample Timestep Importance Identification', width=100)
# Audio players: the full clip, and the clip gated by the importance mask.
# NOTE(review): in a notebook only the last expression renders; the first
# Audio(...) call produces no visible output here.
Audio(padded_waveform, rate=sr)
Audio(padded_waveform*wave_top_mask, rate=sr)
In this section, you will find the references that were used to support the information presented in this study. These references include academic articles, books, reports, and other sources of information that were deemed relevant to the topic at hand. The references are listed in alphabetical order by author's last name, and follow the guidelines set out by the APA (American Psychological Association) style of referencing.
[1] Chouhan, D. (2023, May 12). Lang_data. [Dataset]. Kaggle. Retrieved from https://www.kaggle.com/datasets/shadowfax/lang-data
[2] Stalder, S., Perraudin, N., Achanta, R., Perez-Cruz, F., & Volpi, M. (2022). What You See is What You Classify: Black Box Attributions. In Advances in Neural Information Processing Systems 35 (NeurIPS 2022) Main Conference Track. Retrieved from https://proceedings.neurips.cc/paper_files/paper/2022/file/0073cc73e1873b35345209b50a3dab66-Paper-Conference.pdf
[3] Wang, Z. [UN ESCAP]. (2023, April 18). Multilingualism at the UN: Linguistic Diversity in the Asia-Pacific Region [Video file]. Retrieved from https://www.youtube.com/watch?v=221A6yWDRbE
[4] International Organization for Standardization. (2002). ISO 639-1:2002, Codes for the representation of names of languages — Part 1: Alpha-2 code. Retrieved from https://www.iso.org/standard/22109.html
[5] UN Economic and Social Commission for Asia and the Pacific. (n.d.). One UN, many voices: Why multilingualism matters. Retrieved from https://www.unescap.org/story/one-un-many-voices-why-multilingualism-matters
[6] Stalder, S. (n.d.). NN-Explainer. Retrieved from https://github.com/stevenstalder/NN-Explainer